This report looks finding intron retention genes using simple summary
statistics and visualisations using superintronic. We also compare the results to those
obtained using IsoFormSwitchAnalyzeR.
We will use data from 6 samples from the human cell line mixture RNA-seq data. There are three biological replicates across two library preparation kits, a polyA kit and a total RNA kit. The BAM files have been previously aligned with subread using the hg20 reference genome.
design <- read.table(
here::here("data-raw", "targets.txt"),
header = TRUE,
stringsAsFactors = FALSE
) %>%
S4Vectors::DataFrame() %>%
transform(
File = S4Vectors::Rle(here::here("data-raw", sub("\\.", "-", File)))
) %>%
BiocGenerics::subset(Replicate %in% c("R1", "R2", "R3") & Mixture == 0,
select = c(File, Mixture, Replicate, Kit)) %>%
S4Vectors::transform(
CellLine = "HCC287",
Kit = S4Vectors::Rle(ifelse(Kit == "mRNA", "polyA", "total-RNA"))
) %>%
S4Vectors::transform(
Sample = paste(Replicate, CellLine, Kit, sep = "_")
)
design
#> DataFrame with 6 rows and 6 columns
#> File
#> <Rle>
#> 1 /Volumes/lee.s/intron_report/data-raw/R1-000_C5N4YANXX_GATCAGAT_R1.bam
#> 2 /Volumes/lee.s/intron_report/data-raw/R2-000_C5N4YANXX_AGTTCCGT_R1.bam
#> 3 /Volumes/lee.s/intron_report/data-raw/R3-000_C5N4YANXX_GTGGCCTT_R1.bam
#> 4 /Volumes/lee.s/intron_report/data-raw/R1-000_C5N4YANXX_ACAGTGAT_R1.bam
#> 5 /Volumes/lee.s/intron_report/data-raw/R2-000_C5N4YANXX_TAGCTTAT_R1.bam
#> 6 /Volumes/lee.s/intron_report/data-raw/R3-000_C5N4YANXX_ATGTCAGA_R1.bam
#> Mixture Replicate Kit CellLine Sample
#> <integer> <character> <Rle> <character> <character>
#> 1 0 R1 polyA HCC287 R1_HCC287_polyA
#> 2 0 R2 polyA HCC287 R2_HCC287_polyA
#> 3 0 R3 polyA HCC287 R3_HCC287_polyA
#> 4 0 R1 total-RNA HCC287 R1_HCC287_total-RNA
#> 5 0 R2 total-RNA HCC287 R2_HCC287_total-RNA
#> 6 0 R3 total-RNA HCC287 R3_HCC287_total-RNA
superintronic analysisA superintronic analysis consists of four steps: 1. Preparing the annotation and extract exonic/intronic parts 2. Computing coverage over regions of interest 3. Summarising coverage as rangenostics 4. Visualising the results
library(superintronic)
gff <- here::here("data-raw", "gencode.v27.annotation.gtf.gz")
ref <- "hg38"
# uncomment if no internet connection
# ref <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38 %>%
# get_genome_info() %>%
# GenomeInfoDb::keepStandardChromosomes("Homo sapiens", "coarse")
gr_gff <- read_gff(gff, genome_info = ref)
parts <- collect_parts(gr_gff)
parts
#> GRanges object with 58288 ranges and 8 metadata columns:
#> seqnames ranges strand | gene_id
#> <Rle> <IRanges> <Rle> | <Rle>
#> [1] chrX 100627109-100639991 - | ENSG00000000003.14
#> [2] chrX 100584802-100599885 + | ENSG00000000005.5
#> [3] chr20 50934867-50958555 - | ENSG00000000419.12
#> [4] chr1 169849631-169894267 - | ENSG00000000457.13
#> [5] chr1 169662007-169854080 + | ENSG00000000460.16
#> ... ... ... ... . ...
#> [58284] chr1 6767954-6770038 + | ENSG00000284744.1
#> [58285] chr1 2960658-2968707 - | ENSG00000284745.1
#> [58286] chr8 12601158-12601376 - | ENSG00000284746.1
#> [58287] chr1 7991134-8005312 + | ENSG00000284747.1
#> [58288] chr1 37596126-37607336 + | ENSG00000284748.1
#> gene_type gene_name type source
#> <character> <character> <factor> <factor>
#> [1] protein_coding TSPAN6 gene HAVANA
#> [2] protein_coding TNMD gene HAVANA
#> [3] protein_coding DPM1 gene HAVANA
#> [4] protein_coding SCYL3 gene HAVANA
#> [5] protein_coding C1orf112 gene HAVANA
#> ... ... ... ... ...
#> [58284] lincRNA AL591163.1 gene HAVANA
#> [58285] lincRNA AL589702.1 gene HAVANA
#> [58286] transcribed_unprocessed_pseudogene AC068587.10 gene HAVANA
#> [58287] antisense_RNA AL034417.4 gene HAVANA
#> [58288] bidirectional_promoter_lncRNA AL513220.1 gene HAVANA
#> n_olaps
#> <integer>
#> [1] 1
#> [2] 1
#> [3] 2
#> [4] 3
#> [5] 6
#> ... ...
#> [58284] 1
#> [58285] 1
#> [58286] 2
#> [58287] 3
#> [58288] 1
#> exonic_parts
#> <GRangesList>
#> [1] chrX:100627109-100629986:-,chrX:100630759-100630866:-,chrX:100632063-100632068:-,...
#> [2] chrX:100584802-100585066:+,chrX:100585231-100585362:+,chrX:100593624-100594035:+,...
#> [3] chr20:50934867-50935236:-,chr20:50936148-50936262:-,chr20:50940865-50940955:-,...
#> [4] chr1:169849631-169853772:-,chr1:169854270-169854964:-,chr1:169855796-169855957:-,...
#> [5] chr1:169662007-169662523:+,chr1:169683469-169683625:+,chr1:169683756-169683932:+,...
#> ... ...
#> [58284] chr1:6767954-6768762:+,chr1:6769727-6769776:+,chr1:6769911-6770038:+
#> [58285] chr1:2960658-2961585:-,chr1:2961672-2962100:-,chr1:2963264-2963415:-,...
#> [58286] chr8:12601158-12601376:-
#> [58287] chr1:7991134-7991447:+,chr1:7994547-7994664:+,chr1:7996321-7996443:+,...
#> [58288] chr1:37596126-37596338:+,chr1:37606224-37607336:+
#> intronic_parts
#> <GRangesList>
#> [1] chrX:100629987-100630758:-,chrX:100630867-100632062:-,chrX:100632069-100632484:-,...
#> [2] chrX:100585067-100585230:+,chrX:100585363-100593623:+,chrX:100594036-100594260:+,...
#> [3] chr20:50935237-50936147:-,chr20:50936263-50940864:-,chr20:50940956-50941104:-,...
#> [4] chr1:169853773-169854269:-,chr1:169854965-169855795:-,chr1:169855958-169859040:-,...
#> [5] chr1:169662524-169683468:+,chr1:169683626-169683755:+,chr1:169683933-169783810:+,...
#> ... ...
#> [58284] chr1:6768763-6769726:+,chr1:6769777-6769910:+
#> [58285] chr1:2961586-2961671:-,chr1:2962101-2963263:-,chr1:2963416-2968562:-
#> [58286]
#> [58287] chr1:7991448-7994546:+,chr1:7994665-7996320:+,chr1:7996444-7999539:+,...
#> [58288] chr1:37596339-37606223:+
#> -------
#> seqinfo: 595 sequences from hg38 genome
These can be interrogated as desired, but for our purposes we will keep genes that are protein coding, on main chromosomes but not mitochondrial genome, and do not overlap any other genes and have more than one exon.
parts_sub <- parts %>%
filter(
gene_type == "protein_coding",
n_olaps == 1,
seqnames != "chrM",
lengths(exonic_parts) > 1
) %>%
GenomeInfoDb::keepStandardChromosomes("Homo sapiens", "coarse") %>%
GenomeInfoDb::dropSeqlevels("chrM", "coarse")
parts_sub
#> GRanges object with 6606 ranges and 8 metadata columns:
#> seqnames ranges strand | gene_id
#> <Rle> <IRanges> <Rle> | <Rle>
#> [1] chrX 100627109-100639991 - | ENSG00000000003.14
#> [2] chrX 100584802-100599885 + | ENSG00000000005.5
#> [3] chr1 27612064-27635277 - | ENSG00000000938.12
#> [4] chr1 196651878-196747504 + | ENSG00000000971.15
#> [5] chrX 65512582-65534775 - | ENSG00000001497.16
#> ... ... ... ... . ...
#> [6602] chr19 35586260-35587296 + | ENSG00000283758.1
#> [6603] chr16 1431035-1433397 + | ENSG00000284395.1
#> [6604] chr11 124395534-124399024 - | ENSG00000284609.1
#> [6605] chr19 3482804-3483446 - | ENSG00000284638.1
#> [6606] chr11 124382394-124384462 - | ENSG00000284680.1
#> gene_type gene_name type source n_olaps
#> <character> <character> <factor> <factor> <integer>
#> [1] protein_coding TSPAN6 gene HAVANA 1
#> [2] protein_coding TNMD gene HAVANA 1
#> [3] protein_coding FGR gene HAVANA 1
#> [4] protein_coding CFH gene HAVANA 1
#> [5] protein_coding LAS1L gene HAVANA 1
#> ... ... ... ... ... ...
#> [6602] protein_coding AC002115.2 gene HAVANA 1
#> [6603] protein_coding AL032819.3 gene HAVANA 1
#> [6604] protein_coding OR8B3 gene HAVANA 1
#> [6605] protein_coding AC005551.1 gene HAVANA 1
#> [6606] protein_coding OR8B2 gene HAVANA 1
#> exonic_parts
#> <GRangesList>
#> [1] chrX:100627109-100629986:-,chrX:100630759-100630866:-,chrX:100632063-100632068:-,...
#> [2] chrX:100584802-100585066:+,chrX:100585231-100585362:+,chrX:100593624-100594035:+,...
#> [3] chr1:27612064-27613122:-,chr1:27613219-27613350:-,chr1:27614430-27614583:-,...
#> [4] chr1:196651878-196652175:+,chr1:196672978-196673407:+,chr1:196673857-196673962:+,...
#> [5] chrX:65512582-65512901:-,chrX:65514823-65514973:-,chrX:65517987-65518465:-,...
#> ... ...
#> [6602] chr19:35586260-35586541:+,chr19:35586976-35587296:+
#> [6603] chr16:1431035-1431108:+,chr16:1432547-1433397:+
#> [6604] chr11:124395534-124397368:-,chr11:124398690-124399024:-
#> [6605] chr19:3482804-3483051:-,chr19:3483202-3483446:-
#> [6606] chr11:124382394-124383360:-,chr11:124384374-124384462:-
#> intronic_parts
#> <GRangesList>
#> [1] chrX:100629987-100630758:-,chrX:100630867-100632062:-,chrX:100632069-100632484:-,...
#> [2] chrX:100585067-100585230:+,chrX:100585363-100593623:+,chrX:100594036-100594260:+,...
#> [3] chr1:27613123-27613218:-,chr1:27613351-27614429:-,chr1:27614584-27614849:-,...
#> [4] chr1:196652176-196672977:+,chr1:196673408-196673856:+,chr1:196673963-196675988:+,...
#> [5] chrX:65512902-65514822:-,chrX:65514974-65517986:-,chrX:65518466-65520428:-,...
#> ... ...
#> [6602] chr19:35586542-35586975:+
#> [6603] chr16:1431109-1432546:+
#> [6604] chr11:124397369-124398689:-
#> [6605] chr19:3483052-3483201:-
#> [6606] chr11:124383361-124384373:-
#> -------
#> seqinfo: 24 sequences from hg38 genome
Now we can take our BAM files and compute coverage. This
produces a large GRanges object, with two metadata columns,
source which is the BAM file that coverage was computed for, and score
which is the coverage score. By default, this function computes
the coverage scores in parallel using BiocParallel.
cvg <- compute_coverage_long(design,
source = "File",
.which = get_genome_info(parts_sub),
.genome_info = get_genome_info(parts_sub),
.parallel = BiocParallel::MulticoreParam())
# transform to save space and drop redundant labels
cvg <- cvg %>%
select(-File, -CellLine) %>%
mutate(Replicate = as(Replicate, "Rle"),
Sample = as(Sample, "Rle"))
#> GRanges object with 162505390 ranges and 5 metadata columns:
#> seqnames ranges strand | Mixture Replicate
#> <Rle> <IRanges> <Rle> | <integer> <Rle>
#> [1] chr10 1-48181 * | 0 R1
#> [2] chr10 48182-48281 * | 0 R1
#> [3] chr10 48282-62897 * | 0 R1
#> [4] chr10 62898-62997 * | 0 R1
#> [5] chr10 62998-70639 * | 0 R1
#> ... ... ... ... . ... ...
#> [162505386] chrY 57212137 * | 0 R3
#> [162505387] chrY 57212138-57212149 * | 0 R3
#> [162505388] chrY 57212150-57212176 * | 0 R3
#> [162505389] chrY 57212177-57212181 * | 0 R3
#> [162505390] chrY 57212182-57227415 * | 0 R3
#> Kit Sample score
#> <Rle> <Rle> <integer>
#> [1] polyA R1_HCC287_polyA 0
#> [2] polyA R1_HCC287_polyA 1
#> [3] polyA R1_HCC287_polyA 0
#> [4] polyA R1_HCC287_polyA 1
#> [5] polyA R1_HCC287_polyA 0
#> ... ... ... ...
#> [162505386] total-RNA R3_HCC287_total-RNA 6
#> [162505387] total-RNA R3_HCC287_total-RNA 3
#> [162505388] total-RNA R3_HCC287_total-RNA 2
#> [162505389] total-RNA R3_HCC287_total-RNA 1
#> [162505390] total-RNA R3_HCC287_total-RNA 0
#> -------
#> seqinfo: 24 sequences from hg38 genome
Now all the ingredients are in place to merge the coverage scores
to our prepared annotation with join_parts(). This again
returns a GRanges object, restricted to the intersection of the
coverage ranges with intron/exon ranges. Additional columns are added
corresponding to the properties of the intron/exon.
cvg_over_features <- join_parts(cvg, parts_sub)
Now we will compute summaries (rangenostics) for the polyA Kit to detect IR genes. This consists of three steps
polyA_features <- cvg_over_features %>%
plyranges::filter(Kit == "polyA")
# compute intron/exon features
res <- polyA_features %>%
mutate(log_score = log2(score + 0.5)) %>%
group_by(gene_id, feature_type) %>%
summarise(feature_average = Hmisc::wtd.mean(log_score, width),
feature_sd = sqrt(Hmisc::wtd.var(log_score, width)),
feature_length = sum(BiocGenerics::unique(feature_length)) * n_distinct(Replicate),
score = score,
n_bases = width,
start = min(start),
end = max(end),
seqnames = unlist(BiocGenerics::unique(seqnames))
) %>%
GenomicRanges::makeGRangesFromDataFrame(keep.extra.columns = TRUE)
res
#> GRanges object with 13212 ranges and 7 metadata columns:
#> seqnames ranges strand | gene_id
#> <Rle> <IRanges> <Rle> | <Rle>
#> [1] chrX 100627109-100639991 * | ENSG00000000003.14
#> [2] chrX 100584802-100599885 * | ENSG00000000005.5
#> [3] chr1 27612064-27635277 * | ENSG00000000938.12
#> [4] chr1 196651878-196747504 * | ENSG00000000971.15
#> [5] chrX 65512582-65534775 * | ENSG00000001497.16
#> ... ... ... ... . ...
#> [13208] chr19 35586542-35586975 * | ENSG00000283758.1
#> [13209] chr16 1431109-1432546 * | ENSG00000284395.1
#> [13210] chr11 124397369-124398689 * | ENSG00000284609.1
#> [13211] chr19 3483052-3483201 * | ENSG00000284638.1
#> [13212] chr11 124383361-124384373 * | ENSG00000284680.1
#> feature_type feature_average feature_sd feature_length
#> <Rle> <numeric> <numeric> <integer>
#> [1] exon 3.91015736226114 3.06458115204434 13605
#> [2] exon -1 0 4830
#> [3] exon -1 0 10422
#> [4] exon 1.42147191842735 1.96161951238606 20664
#> [5] exon 4.98861200665988 2.30861757881119 16767
#> ... ... ... ... ...
#> [13208] intron -1 0 1302
#> [13209] intron -0.841283310080774 0.475837681759214 4314
#> [13210] intron -1 0 3963
#> [13211] intron -1 0 450
#> [13212] intron -1 0 3039
#> score n_bases
#> <IntegerList> <IntegerList>
#> [1] 0,1,2,... 46,17,1,...
#> [2] 0,0,0,... 265,132,412,...
#> [3] 0,0,0,... 1059,132,154,...
#> [4] 0,1,2,... 180,6,3,...
#> [5] 2,4,6,... 7,1,3,...
#> ... ... ...
#> [13208] 0,0,0 434,434,434
#> [13209] 1,0,1,... 32,233,100,...
#> [13210] 0,0,0 1321,1321,1321
#> [13211] 0,0,0 150,150,150
#> [13212] 0,0,0 1013,1013,1013
#> -------
#> seqinfo: 24 sequences from an unspecified genome; no seqlengths
E_val <- res %>%
filter(feature_type == "exon") %>%
filter(feature_sd > 0) %>%
summarise(
E_mu = mean(feature_average),
E_sigma = mean(feature_sd),
E_raw = mean(sum(score*n_bases) / sum(n_bases))
)
E_val
#> DataFrame with 1 row and 3 columns
#> E_mu E_sigma E_raw
#> <numeric> <numeric> <numeric>
#> 1 1.99822400568595 1.55589298904874 36.2833426201313
mcols(res) <- cbind(mcols(res), E_val)
rango <- res %>%
mutate(
n_bases_E_raw = sum(n_bases[score > E_raw]),
prop_bases_E_raw = n_bases_E_raw / sum(n_bases)
) %>%
plyranges::select(-n_bases, -score, -seqnames, -start, -end, -width, -strand,
.drop_ranges = TRUE) %>%
dplyr::as_tibble() %>%
group_by(feature_type) %>%
tidyr::nest() %>%
mutate(
smooth = lapply(data, function(x) {
mgcv::gam(feature_sd ~ s(feature_average), data = x)
}),
augment = lapply(smooth, broom::augment),
) %>%
tidyr::unnest(data, augment) %>%
dplyr::select(-dplyr::ends_with("1"), -.sigma)
We get very similar results to a voom plot in a standard limma analysis. We
have also overlaid on the E_mu and E_sigma values as potential cut-offs for
each feature.
# rangenostics feature_average vs feature_sd coloured by n of bases above
# average raw exon coverage
library(ggplot2)
ggplot(rango, aes(x = feature_average,
y = feature_sd)) +
geom_point() +
geom_line(aes(y = .fitted), colour = "blue") +
geom_vline(aes(xintercept = E_mu),
data = dplyr::distinct(rango, feature_type, E_mu)) +
geom_hline(aes(yintercept = E_sigma),
data = dplyr::distinct(rango, feature_type, E_sigma)) +
facet_wrap(~feature_type)
We also plot pairs of rangenositcs for each feature in a scatter plot matrix (using a hexbin scatter plot). Again we get similar results to the plots produced by a DE analysis.
all_features <- tidyr::gather(rango, "key", "value",
-gene_id, -feature_type, -E_mu, -E_sigma) %>%
mutate(var = paste0(feature_type, "_", key)) %>%
plyranges::select(-feature_type, -key) %>%
arrange(gene_id) %>%
tidyr::spread(var, value)
all_features %>%
plyranges::select(gene_id, exon_feature_average,
intron_feature_average, intron_feature_sd, intron_n_bases_E_raw) %>%
GGally::ggpairs(columns = 2:5, lower = list(continuous = hex))
hits <- all_features %>%
filter(exon_feature_average > E_mu,
intron_feature_sd > E_sigma,
intron_n_bases_E_raw > quantile(intron_n_bases_E_raw, 0.99)) %>%
group_by(gene_id)
hits
#> # A tibble: 43 x 25
#> # Groups: gene_id [43]
#> gene_id E_mu E_sigma exon_.cooksd exon_.fitted exon_.hat exon_.resid
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 ENSG00… 2.00 1.56 0.00132 2.34 0.00371 0.950
#> 2 ENSG00… 2.00 1.56 0.00000198 2.35 0.00394 0.0357
#> 3 ENSG00… 2.00 1.56 0.00242 2.36 0.00466 1.15
#> 4 ENSG00… 2.00 1.56 0.00000365 2.21 0.00135 0.0831
#> 5 ENSG00… 2.00 1.56 0.000156 2.30 0.00264 -0.388
#> 6 ENSG00… 2.00 1.56 0.000179 2.13 0.00127 0.599
#> 7 ENSG00… 2.00 1.56 0.0000560 2.30 0.00263 0.233
#> 8 ENSG00… 2.00 1.56 0.000483 2.22 0.00130 0.972
#> 9 ENSG00… 2.00 1.56 0.000775 2.34 0.00569 -0.587
#> 10 ENSG00… 2.00 1.56 0.00453 2.31 0.00268 2.07
#> # … with 33 more rows, and 18 more variables: exon_.se.fit <dbl>,
#> # exon_E_raw <dbl>, exon_feature_average <dbl>,
#> # exon_feature_length <dbl>, exon_feature_sd <dbl>,
#> # exon_n_bases_E_raw <dbl>, exon_prop_bases_E_raw <dbl>,
#> # intron_.cooksd <dbl>, intron_.fitted <dbl>, intron_.hat <dbl>,
#> # intron_.resid <dbl>, intron_.se.fit <dbl>, intron_E_raw <dbl>,
#> # intron_feature_average <dbl>, intron_feature_length <dbl>,
#> # intron_feature_sd <dbl>, intron_n_bases_E_raw <dbl>,
#> # intron_prop_bases_E_raw <dbl>
nrow(hits) genes.hits %>%
dplyr::group_walk(save_plot)
#> # A tibble: 43 x 25
#> # Groups: gene_id [43]
#> gene_id E_mu E_sigma exon_.cooksd exon_.fitted exon_.hat exon_.resid
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 ENSG00… 2.00 1.56 0.00132 2.34 0.00371 0.950
#> 2 ENSG00… 2.00 1.56 0.00000198 2.35 0.00394 0.0357
#> 3 ENSG00… 2.00 1.56 0.00242 2.36 0.00466 1.15
#> 4 ENSG00… 2.00 1.56 0.00000365 2.21 0.00135 0.0831
#> 5 ENSG00… 2.00 1.56 0.000156 2.30 0.00264 -0.388
#> 6 ENSG00… 2.00 1.56 0.000179 2.13 0.00127 0.599
#> 7 ENSG00… 2.00 1.56 0.0000560 2.30 0.00263 0.233
#> 8 ENSG00… 2.00 1.56 0.000483 2.22 0.00130 0.972
#> 9 ENSG00… 2.00 1.56 0.000775 2.34 0.00569 -0.587
#> 10 ENSG00… 2.00 1.56 0.00453 2.31 0.00268 2.07
#> # … with 33 more rows, and 18 more variables: exon_.se.fit <dbl>,
#> # exon_E_raw <dbl>, exon_feature_average <dbl>,
#> # exon_feature_length <dbl>, exon_feature_sd <dbl>,
#> # exon_n_bases_E_raw <dbl>, exon_prop_bases_E_raw <dbl>,
#> # intron_.cooksd <dbl>, intron_.fitted <dbl>, intron_.hat <dbl>,
#> # intron_.resid <dbl>, intron_.se.fit <dbl>, intron_E_raw <dbl>,
#> # intron_feature_average <dbl>, intron_feature_length <dbl>,
#> # intron_feature_sd <dbl>, intron_n_bases_E_raw <dbl>,
#> # intron_prop_bases_E_raw <dbl>
The Bioconductor package IsoformSwitchAnalyzeR also allows for the exploration
of intron retention events using a similar workflow:
We used kallisto quant on single read mode with an average fragment size of 200 and a standard deviation of 20, on all fastq files for each replicate in the polyA/total RNA kits. We need to use both here since IsoformSwitchAnalzyeR requires biological replicates for testing for switching events.
library(IsoformSwitchAnalyzeR)
quants <- importIsoformExpression(
parentDir = here::here("data", "kallisto/"),
)
switchList by importing annotations
and design matrix.# use a simplified version of the design
design_isf <- data.frame(sampleID = colnames(quants$abundance[,-1])) %>%
mutate(condition = rep(c("polyA", "total-RNA"), 3),
replicate = rep(c("R1", "R2", "R3"), each = 2)
)
isf <- importRdata(
isoformCountMatrix = quants$counts,
isoformRepExpression = quants$abundance,
designMatrix = design_isf,
isoformExonAnnoation = here::here("data-raw", "gencode.v27.annotation.gtf.gz"),
isoformNtFasta = here::here("data-raw", "gencode.v27.transcripts.fa.gz"),
removeNonConvensionalChr = TRUE,
)
#>
|
| | 0%
|
|================================ | 50%
|
|=================================================================| 100%
#>
|
| | 0%
|
|=================================================================| 100%
isf
#> This switchAnalyzeRlist list contains:
#> 148002 isoforms from 37959 genes
#> 1 comparison from 2 conditions (in total 6 samples)
#>
#> Feature analyzed:
#> [1] "ORFs, ntSequence"
Next we prefilter, the isoforms using roughly the same filters we used in the superintronic analysis above. To speed things up we could further specify a gene expression filter, but we have left that out for now.
isf_filtered <- preFilter(isf,
acceptedGeneBiotype = "protein_coding",
removeSingleIsoformGenes = TRUE,
geneExpressionCutoff = 1,
isoformExpressionCutoff = 0.1,
keepIsoformInAllConditions = TRUE
)
isf_filtered
#> This switchAnalyzeRlist list contains:
#> 76882 isoforms from 11186 genes
#> 1 comparison from 2 conditions (in total 6 samples)
#>
#> Feature analyzed:
#> [1] "ORFs, ntSequence"
First we are required to run DEXSeq to test for isoform switching ( this does all pairwise comparisons between conditions, which slows things down quite a bit.) and between the two kits. We have used quite a small alpha since DEXSeq seems quite liberal…
isf_analysed <- isoformSwitchTestDEXSeq(isf_filtered,
reduceToSwitchingGenes = TRUE,
alpha = 0.01,
dIFcutoff = 0.15)
isf_analysed
#> This switchAnalyzeRlist list contains:
#> 15324 isoforms from 2366 genes
#> 1 comparison from 2 conditions (in total 6 samples)
#>
#> Switching features:
#> Comparison switchingIsoforms switchingGenes
#> 1 polyA vs total_RNA 5440 2366
#>
#> Feature analyzed:
#> [1] "Isoform Switch Identification, ORFs, ntSequence"
Now we can analyse intron retention events from one kit relative to another:
# this takes quite a while to compute...
isf_analysed <- analyzeIntronRetention(isf_analysed)
#>
|
| | 0%
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|== | 4%
|
|=== | 4%
|
|=== | 5%
|
|==== | 5%
|
|==== | 6%
|
|==== | 7%
|
|===== | 7%
|
|===== | 8%
|
|====== | 8%
|
|====== | 9%
|
|====== | 10%
|
|======= | 10%
|
|======= | 11%
|
|======= | 12%
|
|======== | 12%
|
|======== | 13%
|
|========= | 13%
|
|========= | 14%
|
|========= | 15%
|
|========== | 15%
|
|========== | 16%
|
|=========== | 16%
|
|=========== | 17%
|
|=========== | 18%
|
|============ | 18%
|
|============ | 19%
|
|============= | 19%
|
|============= | 20%
|
|============= | 21%
|
|============== | 21%
|
|============== | 22%
|
|=============== | 22%
|
|=============== | 23%
|
|=============== | 24%
|
|================ | 24%
|
|================ | 25%
|
|================= | 25%
|
|================= | 26%
|
|================= | 27%
|
|================== | 27%
|
|================== | 28%
|
|=================== | 28%
|
|=================== | 29%
|
|=================== | 30%
|
|==================== | 30%
|
|==================== | 31%
|
|==================== | 32%
|
|===================== | 32%
|
|===================== | 33%
|
|====================== | 33%
|
|====================== | 34%
|
|====================== | 35%
|
|======================= | 35%
|
|======================= | 36%
|
|======================== | 36%
|
|======================== | 37%
|
|======================== | 38%
|
|========================= | 38%
|
|========================= | 39%
|
|========================== | 39%
|
|========================== | 40%
|
|========================== | 41%
|
|=========================== | 41%
|
|=========================== | 42%
|
|============================ | 42%
|
|============================ | 43%
|
|============================ | 44%
|
|============================= | 44%
|
|============================= | 45%
|
|============================== | 45%
|
|============================== | 46%
|
|============================== | 47%
|
|=============================== | 47%
|
|=============================== | 48%
|
|================================ | 48%
|
|================================ | 49%
|
|================================ | 50%
|
|================================= | 50%
|
|================================= | 51%
|
|================================= | 52%
|
|================================== | 52%
|
|================================== | 53%
|
|=================================== | 53%
|
|=================================== | 54%
|
|=================================== | 55%
|
|==================================== | 55%
|
|==================================== | 56%
|
|===================================== | 56%
|
|===================================== | 57%
|
|===================================== | 58%
|
|====================================== | 58%
|
|====================================== | 59%
|
|======================================= | 59%
|
|======================================= | 60%
|
|======================================= | 61%
|
|======================================== | 61%
|
|======================================== | 62%
|
|========================================= | 62%
|
|========================================= | 63%
|
|========================================= | 64%
|
|========================================== | 64%
|
|========================================== | 65%
|
|=========================================== | 65%
|
|=========================================== | 66%
|
|=========================================== | 67%
|
|============================================ | 67%
|
|============================================ | 68%
|
|============================================= | 68%
|
|============================================= | 69%
|
|============================================= | 70%
|
|============================================== | 70%
|
|============================================== | 71%
|
|============================================== | 72%
|
|=============================================== | 72%
|
|=============================================== | 73%
|
|================================================ | 73%
|
|================================================ | 74%
|
|================================================ | 75%
|
|================================================= | 75%
|
|================================================= | 76%
|
|================================================== | 76%
|
|================================================== | 77%
|
|================================================== | 78%
|
|=================================================== | 78%
|
|=================================================== | 79%
|
|==================================================== | 79%
|
|==================================================== | 80%
|
|==================================================== | 81%
|
|===================================================== | 81%
|
|===================================================== | 82%
|
|====================================================== | 82%
|
|====================================================== | 83%
|
|====================================================== | 84%
|
|======================================================= | 84%
|
|======================================================= | 85%
|
|======================================================== | 85%
|
|======================================================== | 86%
|
|======================================================== | 87%
|
|========================================================= | 87%
|
|========================================================= | 88%
|
|========================================================== | 88%
|
|========================================================== | 89%
|
|========================================================== | 90%
|
|=========================================================== | 90%
|
|=========================================================== | 91%
|
|=========================================================== | 92%
|
|============================================================ | 92%
|
|============================================================ | 93%
|
|============================================================= | 93%
|
|============================================================= | 94%
|
|============================================================= | 95%
|
|============================================================== | 95%
|
|============================================================== | 96%
|
|=============================================================== | 96%
|
|=============================================================== | 97%
|
|=============================================================== | 98%
|
|================================================================ | 98%
|
|================================================================ | 99%
|
|=================================================================| 99%
|
|=================================================================| 100%
isf_analysed <- analyzeSwitchConsequences(isf_analysed, "intron_retention")
#>
|
| | 0%
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|== | 4%
|
|=== | 4%
|
|=== | 5%
|
|==== | 5%
|
|==== | 6%
|
|==== | 7%
|
|===== | 7%
|
|===== | 8%
|
|====== | 8%
|
|====== | 9%
|
|====== | 10%
|
|======= | 10%
|
|======= | 11%
|
|======= | 12%
|
|======== | 12%
|
|======== | 13%
|
|========= | 13%
|
|========= | 14%
|
|========= | 15%
|
|========== | 15%
|
|========== | 16%
|
|=========== | 16%
|
|=========== | 17%
|
|=========== | 18%
|
|============ | 18%
|
|============ | 19%
|
|============= | 19%
|
|============= | 20%
|
|============= | 21%
|
|============== | 21%
|
|============== | 22%
|
|=============== | 22%
|
|=============== | 23%
|
|=============== | 24%
|
|================ | 24%
|
|================ | 25%
|
|================= | 25%
|
|================= | 26%
|
|================= | 27%
|
|================== | 27%
|
|================== | 28%
|
|=================== | 28%
|
|=================== | 29%
|
|=================== | 30%
|
|==================== | 30%
|
|==================== | 31%
|
|==================== | 32%
|
|===================== | 32%
|
|===================== | 33%
|
|====================== | 33%
|
|====================== | 34%
|
|====================== | 35%
|
|======================= | 35%
|
|======================= | 36%
|
|======================== | 36%
|
|======================== | 37%
|
|======================== | 38%
|
|========================= | 38%
|
|========================= | 39%
|
|========================== | 39%
|
|========================== | 40%
|
|========================== | 41%
|
|=========================== | 41%
|
|=========================== | 42%
|
|============================ | 42%
|
|============================ | 43%
|
|============================ | 44%
|
|============================= | 44%
|
|============================= | 45%
|
|============================== | 45%
|
|============================== | 46%
|
|============================== | 47%
|
|=============================== | 47%
|
|=============================== | 48%
|
|================================ | 48%
|
|================================ | 49%
|
|================================ | 50%
|
|================================= | 50%
|
|================================= | 51%
|
|================================= | 52%
|
|================================== | 52%
|
|================================== | 53%
|
|=================================== | 53%
|
|=================================== | 54%
|
|=================================== | 55%
|
|==================================== | 55%
|
|==================================== | 56%
|
|===================================== | 56%
|
|===================================== | 57%
|
|===================================== | 58%
|
|====================================== | 58%
|
|====================================== | 59%
|
|======================================= | 59%
|
|======================================= | 60%
|
|======================================= | 61%
|
|======================================== | 61%
|
|======================================== | 62%
|
|========================================= | 62%
|
|========================================= | 63%
|
|========================================= | 64%
|
|========================================== | 64%
|
|========================================== | 65%
|
|=========================================== | 65%
|
|=========================================== | 66%
|
|=========================================== | 67%
|
|============================================ | 67%
|
|============================================ | 68%
|
|============================================= | 68%
|
|============================================= | 69%
|
|============================================= | 70%
|
|============================================== | 70%
|
|============================================== | 71%
|
|============================================== | 72%
|
|=============================================== | 72%
|
|=============================================== | 73%
|
|================================================ | 73%
|
|================================================ | 74%
|
|================================================ | 75%
|
|================================================= | 75%
|
|================================================= | 76%
|
|================================================== | 76%
|
|================================================== | 77%
|
|================================================== | 78%
|
|=================================================== | 78%
|
|=================================================== | 79%
|
|==================================================== | 79%
|
|==================================================== | 80%
|
|==================================================== | 81%
|
|===================================================== | 81%
|
|===================================================== | 82%
|
|====================================================== | 82%
|
|====================================================== | 83%
|
|====================================================== | 84%
|
|======================================================= | 84%
|
|======================================================= | 85%
|
|======================================================== | 85%
|
|======================================================== | 86%
|
|======================================================== | 87%
|
|========================================================= | 87%
|
|========================================================= | 88%
|
|========================================================== | 88%
|
|========================================================== | 89%
|
|========================================================== | 90%
|
|=========================================================== | 90%
|
|=========================================================== | 91%
|
|=========================================================== | 92%
|
|============================================================ | 92%
|
|============================================================ | 93%
|
|============================================================= | 93%
|
|============================================================= | 94%
|
|============================================================= | 95%
|
|============================================================== | 95%
|
|============================================================== | 96%
|
|=============================================================== | 96%
|
|=============================================================== | 97%
|
|=============================================================== | 98%
|
|================================================================ | 98%
|
|================================================================ | 99%
|
|=================================================================| 99%
|
|=================================================================| 100%
# the result
isf_analysed
#> This switchAnalyzeRlist list contains:
#> 15324 isoforms from 2366 genes
#> 1 comparison from 2 conditions (in total 6 samples)
#>
#> Switching features:
#> Comparison switchingIsoforms switchingGenes
#> 1 polyA vs total_RNA 5440 2366
#>
#> Feature analyzed:
#> [1] "Isoform Switch Identification, ORFs, ntSequence, Intron Retentions, Switch Consequences"
# isoforms with intron retention events
isf_analysed$switchConsequence %>%
dplyr::count(switchConsequence)
#> # A tibble: 3 x 2
#> switchConsequence n
#> <chr> <int>
#> 1 Intron retention gain 255
#> 2 Intron retention loss 470
#> 3 Intron retention switch 24
And the results can be summarised at the gene level and compared to the superintronic analysis.
At the gene level there are around 502 genes declared significant at 0.01 fdr cut-off between the polyA and total-RNA kits that are filtered for intron retention events.
is_res <- extractTopSwitches(isf_analysed,
filterForConsequences = TRUE,
n = NA) %>%
dplyr::as_tibble()
is_res
#> # A tibble: 502 x 8
#> gene_ref gene_id gene_name condition_1 condition_2 gene_switch_q_v…
#> <chr> <chr> <chr> <chr> <chr> <dbl>
#> 1 geneCom… ENSG00… PSMA6 polyA total_RNA 0.
#> 2 geneCom… ENSG00… DHCR24 polyA total_RNA 0.
#> 3 geneCom… ENSG00… RPL21 polyA total_RNA 0.
#> 4 geneCom… ENSG00… RPS27A polyA total_RNA 0.
#> 5 geneCom… ENSG00… RPS7 polyA total_RNA 0.
#> 6 geneCom… ENSG00… CALR polyA total_RNA 0.
#> 7 geneCom… ENSG00… S100A6 polyA total_RNA 0.
#> 8 geneCom… ENSG00… C19orf53 polyA total_RNA 4.84e-306
#> 9 geneCom… ENSG00… TRAPPC5 polyA total_RNA 3.62e-293
#> 10 geneCom… ENSG00… RPL28 polyA total_RNA 4.78e-291
#> # … with 492 more rows, and 2 more variables:
#> # switchConsequencesGene <lgl>, Rank <int>
We can also look at the coverage plots for these genes (we will choose the top 20 genes):
is_res %>%
filter(Rank < 40) %>%
group_by(gene_id) %>%
dplyr::group_walk(save_plot_isr)
#> # A tibble: 39 x 8
#> # Groups: gene_id [39]
#> gene_ref gene_id gene_name condition_1 condition_2 gene_switch_q_v…
#> <chr> <chr> <chr> <chr> <chr> <dbl>
#> 1 geneCom… ENSG00… PSMA6 polyA total_RNA 0.
#> 2 geneCom… ENSG00… DHCR24 polyA total_RNA 0.
#> 3 geneCom… ENSG00… RPL21 polyA total_RNA 0.
#> 4 geneCom… ENSG00… RPS27A polyA total_RNA 0.
#> 5 geneCom… ENSG00… RPS7 polyA total_RNA 0.
#> 6 geneCom… ENSG00… CALR polyA total_RNA 0.
#> 7 geneCom… ENSG00… S100A6 polyA total_RNA 0.
#> 8 geneCom… ENSG00… C19orf53 polyA total_RNA 4.84e-306
#> 9 geneCom… ENSG00… TRAPPC5 polyA total_RNA 3.62e-293
#> 10 geneCom… ENSG00… RPL28 polyA total_RNA 4.78e-291
#> # … with 29 more rows, and 2 more variables: switchConsequencesGene <lgl>,
#> # Rank <int>
Comparing to the previous analysis with superintronic, there is no overlap between the two methods. (Of the genes in the IsoformSwitchAnalyzeR analysis, only 136 were kept by the filters in the superintronic, none of these were selected by our rangenostics). The comparison to superintronic is not quite the same since this method is based on isoforms within genes between kits rather than our summaries over genes within just the polyA kit, however there still seems to be a lot of intron retention events by IsoformSwitchAnalyzeR.
is_res <- is_res %>%
dplyr::left_join(all_features %>%
mutate(selected = gene_id %in% hits$gene_id) %>%
dplyr::select(gene_id, selected))
dplyr::count(is_res, selected)
#> # A tibble: 2 x 2
#> selected n
#> <lgl> <int>
#> 1 FALSE 136
#> 2 NA 366